Name: Huimiao Chen, JHED ID: hchen185, Email: hchen185@jhu.edu.
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import random
# Load the ROI hierarchy table and one subject's volumes, then join them.
## Remote data sources
url_1 = "https://raw.githubusercontent.com/smart-stats/ds4bio_book/main/book/assetts/kirby21AllLevels.csv"
url_2 = "https://raw.githubusercontent.com/bcaffo/MRIcloudT1volumetrics/master/inst/extdata/multilevel_lookup_table.txt"

## Hierarchy lookup: drop "Level5" and rename the remaining columns to
## roi / level4 / level3 / level2 / level1 (kept in that order).
hierarchy_columns = {
    "modify": "roi",
    "modify.1": "level4",
    "modify.2": "level3",
    "modify.3": "level2",
    "modify.4": "level1",
}
multilevel_lookup = pd.read_csv(url_2, sep="\t")
multilevel_lookup = multilevel_lookup.drop(columns=["Level5"])
multilevel_lookup = multilevel_lookup.rename(columns=hierarchy_columns)
multilevel_lookup = multilevel_lookup[list(hierarchy_columns.values())]

## Subject data: keep only rows with type == 1, level == 5 and the chosen id.
# NOTE: `id` shadows the built-in id(); the name is kept so that any later
# cell reading it keeps working.
id = 127
subjectData = pd.read_csv(url_1)
is_selected = (
    (subjectData["type"] == 1)
    & (subjectData["level"] == 5)
    & (subjectData["id"] == id)
)
subjectData = subjectData.loc[is_selected, ["roi", "volume"]]

## Attach the hierarchy columns to every ROI row (inner join on "roi").
subjectData = subjectData.merge(multilevel_lookup, on="roi")
# Constant root column so that every row shares the same top-level node.
subjectData["icv"] = "ICV"
# Each ROI's share of the summed volume over all retained rows.
subjectData["comp"] = subjectData["volume"] / subjectData["volume"].sum()

## Show the merged table
print(subjectData)
# prepare data for Sankey diagram
## initialize the container that will hold the node and link specifications
data_dict = {}
## prepare node names and colors
data_dict["node"] = {"label": [], "color": []}
### prepare node names
cols = ["icv", "level1", "level2", "level3", "level4", "roi"]
# Prefix each name with its level ("level:name") so that identical names on
# different levels stay distinct nodes; the prefixes are stripped again once
# the links have been built. Names are listed level by level (all "icv"
# entries first, then "level1", ...).
node_names = [
    level + ":" + name
    for level in cols
    for name in subjectData[level]
]
# De-duplicate while keeping first-appearance order. A plain set() would give
# an ordering that changes between interpreter runs (string hashing is
# randomized), which in turn changes node indices and colour assignment.
data_dict["node"]["label"] = list(dict.fromkeys(node_names))
### prepare node colors
opacity_node = 0.8
# A locally seeded generator makes the palette reproducible from run to run
# without touching the global `random` state.
color_rng = random.Random(0)
colors = [
    (
        color_rng.randint(0, 255),
        color_rng.randint(0, 255),
        color_rng.randint(0, 255),
        opacity_node,
    )
    for _ in data_dict["node"]["label"]
]  # one RGBA tuple per node
# Convert the RGBA tuples to the "rgba(r, g, b, a)" strings used as node colors.
color_strings = ["rgba({}, {}, {}, {})".format(r, g, b, a) for r, g, b, a in colors]
data_dict["node"]["color"] = color_strings
## prepare link sources, targets, values, and colors
data_dict["link"] = {"source": [], "target": [], "value": [], "color": []}
# Map every prefixed node label ("level:name") to its index in the node list;
# links refer to nodes by these indices.
label_No = {label_: index_ for index_, label_ in enumerate(data_dict["node"]["label"])}
cols = ["icv", "level1", "level2", "level3", "level4", "roi", "comp"]
subjectData_new = subjectData.loc[:, cols]
levels = cols[:-1]  # hierarchy columns only ("comp" is the value column)
level_pairs = list(zip(levels[:-1], levels[1:]))  # (parent level, child level)
# Total "comp" flowing along every (parent name, child name) edge, computed
# once per pair of adjacent levels instead of once per row. Grouping on both
# names keeps the value correct even if a child name occurs under more than
# one parent.
flow_totals = {
    (parent, child): subjectData_new.groupby([parent, child])["comp"].sum().to_dict()
    for parent, child in level_pairs
}
seen_links = set()  # (source index, target index) pairs already emitted
for row in subjectData_new[levels].to_dict("records"):
    for parent, child in level_pairs:
        source_number = label_No[parent + ":" + row[parent]]
        target_number = label_No[child + ":" + row[child]]
        if (source_number, target_number) in seen_links:
            continue  # many rows share the same upper-level edges
        seen_links.add((source_number, target_number))
        value = flow_totals[(parent, child)][(row[parent], row[child])]
        data_dict["link"]["source"].append(source_number)
        data_dict["link"]["target"].append(target_number)
        data_dict["link"]["value"].append(value)
opacity_link = 0.4
# Each link takes its source node's colour with the alpha component swapped
# for the link opacity. Only the text after the last comma is replaced, so the
# colour channels can never be altered by accident.
data_dict["link"]["color"] = [
    data_dict["node"]["color"][src].rsplit(",", 1)[0] + ", {})".format(opacity_link)
    for src in data_dict["link"]["source"]
]
# Strip the "level:" prefixes for display; split only on the first colon so a
# name that itself contains ":" is kept intact.
data_dict["node"]["label"] = [
    label_.split(":", 1)[1] for label_ in data_dict["node"]["label"]
]
# Draw the Sankey diagram from the prepared node and link specifications.
# Node appearance: labels and colours come from data_dict["node"].
node_spec = {
    "pad": 15,
    "thickness": 15,
    "line": {"color": "black", "width": 0.5},
    "label": data_dict["node"]["label"],
    "color": data_dict["node"]["color"],
}
# Link endpoints (node indices), values and colours come from data_dict["link"].
link_spec = {
    "source": data_dict["link"]["source"],
    "target": data_dict["link"]["target"],
    "value": data_dict["link"]["value"],
    "color": data_dict["link"]["color"],
}
sankey_trace = go.Sankey(
    valuesuffix="(comp)",
    node=node_spec,
    link=link_spec,
)
fig = go.Figure(data=[sankey_trace])
# Title with a hyperlink to the data source.
figure_title = (
    "Structure of the intracranial volume<br>"
    "Source: Multi-level MRICloud data: "
    "<a href='https://raw.githubusercontent.com/smart-stats/ds4bio_book/main/book/assetts/kirby21AllLevels.csv'>"
    "kirby21AllLevels</a>"
)
fig.update_layout(title_text=figure_title, font_size=10)
fig.show()
roi volume level4 level3 \
0 SFG_L 12926 SFG_L Frontal_L
1 SFG_R 10050 SFG_R Frontal_R
2 SFG_PFC_L 12783 SFG_L Frontal_L
3 SFG_PFC_R 11507 SFG_R Frontal_R
4 SFG_pole_L 3078 SFG_L Frontal_L
.. ... ... ... ...
275 Chroid_LVetc_L 444 AnteriorLateralVentricle_L LateralVentricle_L
276 Chroid_LVetc_R 371 AnteriorLateralVentricle_R LateralVentricle_R
277 IV_ventricle 2700 IV_ventricle IV_ventricle
278 ECCL_L 292 inf_DPWM_L InferiorWM_L
279 ECCL_R 292 inf_DPWM_R InferiorWM_R
level2 level1 icv comp
0 CerebralCortex_L Telencephalon_L ICV 0.009350
1 CerebralCortex_R Telencephalon_R ICV 0.007270
2 CerebralCortex_L Telencephalon_L ICV 0.009247
3 CerebralCortex_R Telencephalon_R ICV 0.008324
4 CerebralCortex_L Telencephalon_L ICV 0.002227
.. ... ... ... ...
275 Ventricle CSF ICV 0.000321
276 Ventricle CSF ICV 0.000268
277 Ventricle CSF ICV 0.001953
278 WhiteMatter_L Telencephalon_L ICV 0.000211
279 WhiteMatter_R Telencephalon_R ICV 0.000211
[280 rows x 8 columns]
# Save the figure as an HTML file in the current working directory.
html_output_path = "HW4Q1.html"
fig.write_html(file=html_output_path)
opioid.db that has the data. Next, read the three tables into pandas DataFrames and do the data wrangling from the SQLite chapter directly in pandas. Add the Python code to your hw4.ipynb file.
import plotly.express as px